Filtering criteria:

# Load Gandal's Dataset
if(file.exists('./../Data/Gandal_RNASeq.RData')){
  
  load('./../Data/Gandal_RNASeq.RData')
  
  datExpr_Gandal = datExpr %>% dplyr::select(which(colnames(datExpr) %in% rownames(datMeta[datMeta$Diagnosis_=='CTL',])))
  datMeta_Gandal = datMeta %>% filter(Diagnosis_=='CTL')
  datProbes_Gandal = datProbes
  DE_info_Gandal = DE_info
  
} else print('Gandal_RNASeq.RData does not exist. Find how to create it in 04_26_Gandal_RNASeq_exploratory_analysis.RData')

# Load BrainSpan's Dataset
if(file.exists('./../Data/BrainSpan_filtered.RData')){
  
  load('./../Data/BrainSpan_filtered.RData')
  
  datExpr_BrainSpan = datExpr
  datMeta_BrainSpan = datMeta
  datProbes_BrainSpan = datProbes
  DE_info_BrainSpan = DE_info
  
} else print('BrainSpan_raw.RData does not exist. Find how to create it in 05_14_BrainSpan_exploratory_analysis.RData')

rm(datExpr, datMeta, datProbes, DE_info)

# Probe comparison
paste0('After filtering, Gandal has ', nrow(datExpr_Gandal),' probes and BrainSpan has ', 
       nrow(datExpr_BrainSpan),', of which they share ',
       sum(rownames(datExpr_Gandal) %in% rownames(datExpr_BrainSpan)))
## [1] "After filtering, Gandal has 33478 probes and BrainSpan has 38790, of which they share 28654"
all_probes = unique(c(rownames(datExpr_Gandal), rownames(datExpr_BrainSpan)))

DE_genes_df = data.frame('Gandal' = all_probes %in% rownames(datExpr_Gandal),
                         'BrainSpan' = all_probes %in% rownames(datExpr_BrainSpan))
rownames(DE_genes_df) = all_probes

plot(venneuler(DE_genes_df))

# Filter to keep only common probes
datExpr_Gandal = datExpr_Gandal[rownames(datExpr_Gandal) %in% rownames(datExpr_BrainSpan),]
datProbes_Gandal = datProbes_Gandal[rownames(datProbes_Gandal) %in% rownames(datExpr_Gandal),]

datExpr_BrainSpan = datExpr_BrainSpan[rownames(datExpr_BrainSpan) %in% rownames(datExpr_Gandal),]
datProbes_BrainSpan = datProbes_BrainSpan[rownames(datProbes_BrainSpan) %in% rownames(datExpr_BrainSpan),]

# write.csv(rownames(datExpr_Gandal), file='./../Data/Gandal_BrainSpan_probes.csv', row.names=FALSE)

rm(all_probes, DE_genes_df)


Normalise Gandal dataset

Normalisation method: voom for variance stabilisation

According to https://ucdavis-bioinformatics-training.github.io/2018-June-RNA-Seq-Workshop/thursday/DE.html, if the voom plot looks like the plot below, we should increase the filtering threshold for low expressed genes

v = voom(datExpr_Gandal, plot=TRUE)

rm(v)

Filtering all genes with mean expression lower than 1

to_keep = rowMeans(datExpr_Gandal)>1

datExpr_Gandal = datExpr_Gandal[to_keep,]
datExpr_BrainSpan = datExpr_BrainSpan[to_keep,]

datProbes_Gandal = datProbes_Gandal[to_keep,]
datProbes_BrainSpan = datProbes_BrainSpan[to_keep,]

print(paste0('Keeping ', sum(to_keep), ' genes'))
## [1] "Keeping 25771 genes"

This is supposed to be a good result for voom

datExpr_Gandal = voom(datExpr_Gandal, plot=TRUE)

Mean vs SD

Data is not homoscedastic, same as with the simple log2 transformation, the genes with the highest mean seems to be the most affected by the transformation

plot_data = data.frame('ID'=rownames(datExpr_Gandal$E), 'mean'=rowMeans(datExpr_Gandal$E), 'sd'=apply(datExpr_Gandal$E,1,sd)) %>%
            left_join(SFARI_genes, by='ID') %>% mutate('gene-score' = factor(`gene-score`))

ggplotly(plot_data %>% ggplot(aes(mean, sd, color=`gene-score`)) + geom_point(alpha=0.2) + ggtitle('Gandal dataset') + theme_minimal())
rm(plot_data)
datExpr_BrainSpan = log2(datExpr_BrainSpan+1)

plot_data = data.frame('ID'=rownames(datExpr_BrainSpan), 'mean'=rowMeans(datExpr_BrainSpan), 'sd'=apply(datExpr_BrainSpan,1,sd)) %>%
            left_join(SFARI_genes, by='ID') %>% mutate('gene-score' = factor(`gene-score`))

ggplotly(plot_data %>% ggplot(aes(mean, sd, color=`gene-score`)) + geom_point(alpha=0.2) + ggtitle('BrainSpan dataset') + theme_minimal())
rm(plot_data)


Create artificial binary classifier

Assigning a label to each subject, so different samples from the same subject share class, as it would happen with real ASD samples

Gandal dataset:

fake_class = data.frame('Subject_ID' = unique(datMeta_Gandal$Subject_ID),
                        'fake' = rbinom(length(unique(datMeta_Gandal$Subject_ID)), 1, 0.5))
datMeta_Gandal = datMeta_Gandal %>% left_join(fake_class, by='Subject_ID')

table(datMeta_Gandal$fake)
## 
##  0  1 
## 18 17

BrainSpan dataset:

fake_class = data.frame('donor_id' = unique(datMeta_BrainSpan$donor_id),
                        'fake' = rbinom(length(unique(datMeta_BrainSpan$donor_id)), 1, 0.5))
datMeta_BrainSpan = datMeta_BrainSpan %>% left_join(fake_class, by='donor_id')

table(datMeta_BrainSpan$fake)
## 
##   0   1 
## 145 156


Perform DEA

Following Gandal’s preprocessing script (limma lmFit), removing robust parameter to improve running time for BrainSpan

# Gandal dataset:
mod = model.matrix(~fake, data=datMeta_Gandal)
corfit = duplicateCorrelation(datExpr_Gandal, mod, block=datMeta_Gandal$Subject_ID)
lmfit = lmFit(datExpr_Gandal, design=mod, block=datMeta_Gandal$Subject_ID, correlation=corfit$consensus)

fit = eBayes(lmfit, trend=T)
top_genes = topTable(fit, coef=2, number=nrow(datExpr_Gandal))
new_DE_info_Gandal = top_genes[match(rownames(datExpr_Gandal), rownames(top_genes)),]

new_DE_info_Gandal$signif_p_val = new_DE_info_Gandal$adj.P.Val<0.05
new_DE_info_Gandal$signif_lfc = abs(new_DE_info_Gandal$logFC)>log2(1.2)
new_DE_info_Gandal$signif = new_DE_info_Gandal$signif_p_val & new_DE_info_Gandal$signif_lfc

new_DE_info_Gandal = new_DE_info_Gandal %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>%
                     dplyr::select(ID, logFC, AveExpr, P.Value, adj.P.Val, signif_p_val, signif_lfc, signif,
                                   `gene-score`, syndromic) %>% mutate(`gene-score` = as.factor(`gene-score`))

rm(mod, corfit, lmfit, fit, top_genes)
# BrainSpan dataset:
mod = model.matrix(~fake, data=datMeta_BrainSpan)
corfit = duplicateCorrelation(datExpr_BrainSpan, mod, block=datMeta_BrainSpan$donor_id)
lmfit = lmFit(datExpr_BrainSpan, design=mod, block=datMeta_BrainSpan$donor_id, correlation=corfit$consensus)

fit = eBayes(lmfit, trend=T)
top_genes = topTable(fit, coef=2, number=nrow(datExpr_BrainSpan))
new_DE_info_BrainSpan = top_genes[match(rownames(datExpr_BrainSpan), rownames(top_genes)),]

new_DE_info_BrainSpan$signif_p_val = new_DE_info_BrainSpan$adj.P.Val<0.05
new_DE_info_BrainSpan$signif_lfc = abs(new_DE_info_BrainSpan$logFC)>log2(1.2)
new_DE_info_BrainSpan$signif = new_DE_info_BrainSpan$signif_p_val & new_DE_info_BrainSpan$signif_lfc

new_DE_info_BrainSpan = new_DE_info_BrainSpan %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>%
                        dplyr::select(ID, logFC, AveExpr, P.Value, adj.P.Val, signif_p_val, signif_lfc, signif,
                                      `gene-score`, syndromic) %>% mutate(`gene-score` = as.factor(`gene-score`))


rm(mod, corfit, lmfit, fit, top_genes)


Compare logFC by SFARI score

The relation between SFARI score and logFC weakens, maybe even disappears.

p1 = ggplotly(new_DE_info_Gandal %>% ggplot(aes(`gene-score`, abs(logFC), fill=`gene-score`)) + geom_boxplot() + 
              scale_color_manual(values=gg_colour_hue(9)) + theme_minimal() + theme(legend.position = 'none') +
              ggtitle('logFC in Gandal dataset (left) and in BrainSpan (right)'))

p2 = ggplotly(new_DE_info_BrainSpan %>% ggplot(aes(`gene-score`, abs(logFC), fill=`gene-score`)) + geom_boxplot() + 
              scale_color_manual(values=gg_colour_hue(9)) + theme_minimal() + theme(legend.position = 'none') +
              ggtitle('logFC in Gandal dataset (left) and in BrainSpan (right)'))

subplot(p1, p2, nrows=1)
rm(p1, p2)

There isn’t a strong correlation between logFC in the two datasets

plot_data = new_DE_info_Gandal %>% left_join(new_DE_info_BrainSpan, by='ID') %>% 
            mutate('gene-score'=as.factor(ifelse(is.na(`gene-score.x`), 'NA', `gene-score.x`)))

corr = cor(plot_data$logFC.x, plot_data$logFC.y)
ggplotly(plot_data %>% ggplot(aes(logFC.x, logFC.y, color=`gene-score`)) + geom_point(alpha=0.2) + 
         geom_hline(yintercept=log2(1.2), color='#cc0099') + geom_hline(yintercept=-log2(1.2), color='#cc0099') + 
         geom_vline(xintercept=log2(1.2), color='#cc0099') + geom_vline(xintercept=-log2(1.2), color='#cc0099') +
         ggtitle(paste0('logFC comparison by gene corr=', round(corr,4))) + scale_color_manual(values=gg_colour_hue(9)) + 
         xlab('Gandal') + ylab('BrainSpan') + theme_minimal())
rm(corr)
for(score in sort(unique(plot_data$`gene-score`))){
  corr = cor(plot_data$logFC.x[plot_data$`gene-score`==score], plot_data$logFC.y[plot_data$`gene-score`==score])
  print(paste0('Correlation = ', round(corr,4), ' for SFARI score ', score))
}
## [1] "Correlation = 0.6383 for SFARI score 1"
## [1] "Correlation = 0.3934 for SFARI score 2"
## [1] "Correlation = 0.3662 for SFARI score 3"
## [1] "Correlation = 0.3264 for SFARI score 4"
## [1] "Correlation = 0.4276 for SFARI score 5"
## [1] "Correlation = 0.2439 for SFARI score 6"
## [1] "Correlation = 0.1726 for SFARI score NA"

Concordance in significance of genes based in log Fold Change

table(plot_data$signif_lfc.x, plot_data$signif_lfc.y)/nrow(plot_data)*100
##        
##             FALSE      TRUE
##   FALSE 65.981142  4.148073
##   TRUE  28.171200  1.699585


For both datasets, almost no genes have a significant adjusted p-value (makes sense because the classifier has no biological meaning)

summary(plot_data$signif_p_val.x)
##    Mode   FALSE 
## logical   25771
summary(plot_data$signif_p_val.y)
##    Mode   FALSE 
## logical   25771